Sys.Date()
## [1] "2020-12-18"
Authors: Hanna Klimczak, Kamil Pluciński
library(plotly)
library(knitr)
library(kableExtra)
library(caret)
To ensure that running the notebook will always return the same output, we set seed to 0.
set.seed(0)
data <- read.csv("Life_Expectancy_Data.csv")
kable(head(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Country | Year | Status | Life.expectancy | Adult.Mortality | infant.deaths | Alcohol | percentage.expenditure | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | thinness.5.9.years | Income.composition.of.resources | Schooling |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Afghanistan | 2015 | Developing | 65.0 | 263 | 62 | 0.01 | 71.279624 | 65 | 1154 | 19.1 | 83 | 6 | 8.16 | 65 | 0.1 | 584.25921 | 33736494 | 17.2 | 17.3 | 0.479 | 10.1 |
| Afghanistan | 2014 | Developing | 59.9 | 271 | 64 | 0.01 | 73.523582 | 62 | 492 | 18.6 | 86 | 58 | 8.18 | 62 | 0.1 | 612.69651 | 327582 | 17.5 | 17.5 | 0.476 | 10.0 |
| Afghanistan | 2013 | Developing | 59.9 | 268 | 66 | 0.01 | 73.219243 | 64 | 430 | 18.1 | 89 | 62 | 8.13 | 64 | 0.1 | 631.74498 | 31731688 | 17.7 | 17.7 | 0.470 | 9.9 |
| Afghanistan | 2012 | Developing | 59.5 | 272 | 69 | 0.01 | 78.184215 | 67 | 2787 | 17.6 | 93 | 67 | 8.52 | 67 | 0.1 | 669.95900 | 3696958 | 17.9 | 18.0 | 0.463 | 9.8 |
| Afghanistan | 2011 | Developing | 59.2 | 275 | 71 | 0.01 | 7.097109 | 68 | 3013 | 17.2 | 97 | 68 | 7.87 | 68 | 0.1 | 63.53723 | 2978599 | 18.2 | 18.2 | 0.454 | 9.5 |
| Afghanistan | 2010 | Developing | 58.8 | 279 | 74 | 0.01 | 79.679367 | 66 | 1989 | 16.7 | 102 | 66 | 9.20 | 66 | 0.1 | 553.32894 | 2883167 | 18.4 | 18.4 | 0.448 | 9.2 |
nrow(data)
## [1] 2938
kable(summary(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Country | Year | Status | Life.expectancy | Adult.Mortality | infant.deaths | Alcohol | percentage.expenditure | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | thinness.5.9.years | Income.composition.of.resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Length:2938 | Min. :2000 | Length:2938 | Min. :36.30 | Min. : 1.0 | Min. : 0.0 | Min. : 0.0100 | Min. : 0.000 | Min. : 1.00 | Min. : 0.0 | Min. : 1.00 | Min. : 0.00 | Min. : 3.00 | Min. : 0.370 | Min. : 2.00 | Min. : 0.100 | Min. : 1.68 | Min. :3.400e+01 | Min. : 0.10 | Min. : 0.10 | Min. :0.0000 | Min. : 0.00 | |
| Class :character | 1st Qu.:2004 | Class :character | 1st Qu.:63.10 | 1st Qu.: 74.0 | 1st Qu.: 0.0 | 1st Qu.: 0.8775 | 1st Qu.: 4.685 | 1st Qu.:77.00 | 1st Qu.: 0.0 | 1st Qu.:19.30 | 1st Qu.: 0.00 | 1st Qu.:78.00 | 1st Qu.: 4.260 | 1st Qu.:78.00 | 1st Qu.: 0.100 | 1st Qu.: 463.94 | 1st Qu.:1.958e+05 | 1st Qu.: 1.60 | 1st Qu.: 1.50 | 1st Qu.:0.4930 | 1st Qu.:10.10 | |
| Mode :character | Median :2008 | Mode :character | Median :72.10 | Median :144.0 | Median : 3.0 | Median : 3.7550 | Median : 64.913 | Median :92.00 | Median : 17.0 | Median :43.50 | Median : 4.00 | Median :93.00 | Median : 5.755 | Median :93.00 | Median : 0.100 | Median : 1766.95 | Median :1.387e+06 | Median : 3.30 | Median : 3.30 | Median :0.6770 | Median :12.30 | |
| NA | Mean :2008 | NA | Mean :69.22 | Mean :164.8 | Mean : 30.3 | Mean : 4.6029 | Mean : 738.251 | Mean :80.94 | Mean : 2419.6 | Mean :38.32 | Mean : 42.04 | Mean :82.55 | Mean : 5.938 | Mean :82.32 | Mean : 1.742 | Mean : 7483.16 | Mean :1.275e+07 | Mean : 4.84 | Mean : 4.87 | Mean :0.6276 | Mean :11.99 | |
| NA | 3rd Qu.:2012 | NA | 3rd Qu.:75.70 | 3rd Qu.:228.0 | 3rd Qu.: 22.0 | 3rd Qu.: 7.7025 | 3rd Qu.: 441.534 | 3rd Qu.:97.00 | 3rd Qu.: 360.2 | 3rd Qu.:56.20 | 3rd Qu.: 28.00 | 3rd Qu.:97.00 | 3rd Qu.: 7.492 | 3rd Qu.:97.00 | 3rd Qu.: 0.800 | 3rd Qu.: 5910.81 | 3rd Qu.:7.420e+06 | 3rd Qu.: 7.20 | 3rd Qu.: 7.20 | 3rd Qu.:0.7790 | 3rd Qu.:14.30 | |
| NA | Max. :2015 | NA | Max. :89.00 | Max. :723.0 | Max. :1800.0 | Max. :17.8700 | Max. :19479.912 | Max. :99.00 | Max. :212183.0 | Max. :87.30 | Max. :2500.00 | Max. :99.00 | Max. :17.600 | Max. :99.00 | Max. :50.600 | Max. :119172.74 | Max. :1.294e+09 | Max. :27.70 | Max. :28.60 | Max. :0.9480 | Max. :20.70 | |
| NA | NA | NA | NA’s :10 | NA’s :10 | NA | NA’s :194 | NA | NA’s :553 | NA | NA’s :34 | NA | NA’s :19 | NA’s :226 | NA’s :19 | NA | NA’s :448 | NA’s :652 | NA’s :34 | NA’s :34 | NA’s :167 | NA’s :163 |
As we can see from the summary above, there are some missing values in the dataset. Due to the fact that life.expectancy is the most important attribute for our analysis, we have decided to remove all rows where life.expectancy is NA.
data_new <- data[complete.cases(data[ , 'Life.expectancy']),]
After this operation, we still encounter missing values in the following columns: “Alcohol”, “Hepatitis.B”, “BMI”, “Polio”, “Total.expenditure”, “Diphtheria”, “GDP”, “Population”, “thinness..1.19.years”, “thinness.5.9.years”, “Income.composition.of.resources”, “Schooling”. We will fill these NA values with median for each column.
na_columns <- list("Alcohol", "Hepatitis.B", "BMI", "Polio", "Total.expenditure", "Diphtheria", "GDP", "Population", "thinness..1.19.years", "thinness.5.9.years", "Income.composition.of.resources", "Schooling")
for (col in na_columns){
m <- median(data_new[ , col], na.rm=TRUE)
print(col)
print(m)
data_new[ , col][is.na(data_new[ , col])] <- m
}
## [1] "Alcohol"
## [1] 3.77
## [1] "Hepatitis.B"
## [1] 92
## [1] "BMI"
## [1] 43.35
## [1] "Polio"
## [1] 93
## [1] "Total.expenditure"
## [1] 5.75
## [1] "Diphtheria"
## [1] 93
## [1] "GDP"
## [1] 1764.974
## [1] "Population"
## [1] 1391757
## [1] "thinness..1.19.years"
## [1] 3.3
## [1] "thinness.5.9.years"
## [1] 3.4
## [1] "Income.composition.of.resources"
## [1] 0.677
## [1] "Schooling"
## [1] 12.3
summary(data_new)
## Country Year Status Life.expectancy
## Length:2928 Min. :2000 Length:2928 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2011 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## Adult.Mortality infant.deaths Alcohol percentage.expenditure
## Min. : 1.0 Min. : 0.00 Min. : 0.010 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.00 1st Qu.: 1.107 1st Qu.: 4.854
## Median :144.0 Median : 3.00 Median : 3.770 Median : 65.611
## Mean :164.8 Mean : 30.41 Mean : 4.559 Mean : 740.321
## 3rd Qu.:228.0 3rd Qu.: 22.00 3rd Qu.: 7.400 3rd Qu.: 442.614
## Max. :723.0 Max. :1800.00 Max. :17.870 Max. :19479.912
## Hepatitis.B Measles BMI under.five.deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:82.00 1st Qu.: 0.0 1st Qu.:19.40 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.35 Median : 4.00
## Mean :83.05 Mean : 2427.9 Mean :38.29 Mean : 42.18
## 3rd Qu.:96.00 3rd Qu.: 362.2 3rd Qu.:56.10 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :77.60 Max. :2500.00
## Polio Total.expenditure Diphtheria HIV.AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.370 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.750 Median :93.00 Median : 0.100
## Mean :82.62 Mean : 5.916 Mean :82.39 Mean : 1.748
## 3rd Qu.:97.00 3rd Qu.: 7.330 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## GDP Population thinness..1.19.years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.100
## 1st Qu.: 578.80 1st Qu.:4.181e+05 1st Qu.: 1.600
## Median : 1764.97 Median :1.392e+06 Median : 3.300
## Mean : 6627.39 Mean :1.026e+07 Mean : 4.834
## 3rd Qu.: 4793.63 3rd Qu.:4.593e+06 3rd Qu.: 7.100
## Max. :119172.74 Max. :1.294e+09 Max. :27.700
## thinness.5.9.years Income.composition.of.resources Schooling
## Min. : 0.100 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.600 1st Qu.:0.5040 1st Qu.:10.30
## Median : 3.400 Median :0.6770 Median :12.30
## Mean : 4.865 Mean :0.6301 Mean :12.02
## 3rd Qu.: 7.200 3rd Qu.:0.7730 3rd Qu.:14.10
## Max. :28.600 Max. :0.9480 Max. :20.70
As we can see, we have successfully dealt with missing values.
names(data_new)
## [1] "Country" "Year"
## [3] "Status" "Life.expectancy"
## [5] "Adult.Mortality" "infant.deaths"
## [7] "Alcohol" "percentage.expenditure"
## [9] "Hepatitis.B" "Measles"
## [11] "BMI" "under.five.deaths"
## [13] "Polio" "Total.expenditure"
## [15] "Diphtheria" "HIV.AIDS"
## [17] "GDP" "Population"
## [19] "thinness..1.19.years" "thinness.5.9.years"
## [21] "Income.composition.of.resources" "Schooling"
quantitive_cols = c("Year", "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Hepatitis.B", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria","HIV.AIDS","GDP","Population", "thinness..1.19.years","thinness.5.9.years", "Income.composition.of.resources", "Schooling")
l <- htmltools::tagList()
i <- 1
for (col in quantitive_cols){
l[[i]] <- plot_ly(y = data_new[, col], type = "box",name=col, quartilemethod="exclusive")
i <- i + 1
}
l
p1 <- plot_ly(data_new, x = ~Status) %>%
add_histogram()
p1
correlation_data <- data_new[quantitive_cols]
fig <- plot_ly(
x = quantitive_cols,
y = quantitive_cols,
z = cor(correlation_data), type = "heatmap"
)
fig
The above chart gives us important information about the correlation between variables. As we can see, there is almost perfect correlation between thinness.1.19.years and thinness.5.9.years. There also appears to be a very strong correlation between GDP and percentage.expenditure (0.9), as well as infant.deaths and under.five.deaths (0.99). Naturally, we can also see a strong negative correlation between Adult.Mortality and Life.expectancy (-0.69). Schooling and life expectancy seem to be slightly correlated as well (0.71).
For our prediction, we will need to drop some of the features that are highly correlated. We will drop thinness.5.9.years, percentage.expenditure and infant.deaths, as their correlation to our decision variable is weaker than features correlated to them.
The following plot is fully interactive - hover over selected line to see a specific value, select one country by double clicking on it in the legend. If you want to return to the view of all countries - double click on the legend again.
countries <- unique(data_new$Country)
fig <- plot_ly(data, x = data_new[data_new$Country == 'Afghanistan', 'Year'])
for(name in countries){
fig <- fig %>% add_trace(y = data_new[data_new$Country == name, 'Life.expectancy'], name = name, type = 'scatter', mode = 'lines')
}
fig <- fig %>% layout(legend = list(orientation = 'h'))
fig
regression_data <- data_new[, !names(data_new) %in% c("Country", "thinness.5.9.years", "percentage.expenditure", "infant.deaths", "Year")]
kable(head(regression_data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Status | Life.expectancy | Adult.Mortality | Alcohol | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | Income.composition.of.resources | Schooling |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Developing | 65.0 | 263 | 0.01 | 65 | 1154 | 19.1 | 83 | 6 | 8.16 | 65 | 0.1 | 584.25921 | 33736494 | 17.2 | 0.479 | 10.1 |
| Developing | 59.9 | 271 | 0.01 | 62 | 492 | 18.6 | 86 | 58 | 8.18 | 62 | 0.1 | 612.69651 | 327582 | 17.5 | 0.476 | 10.0 |
| Developing | 59.9 | 268 | 0.01 | 64 | 430 | 18.1 | 89 | 62 | 8.13 | 64 | 0.1 | 631.74498 | 31731688 | 17.7 | 0.470 | 9.9 |
| Developing | 59.5 | 272 | 0.01 | 67 | 2787 | 17.6 | 93 | 67 | 8.52 | 67 | 0.1 | 669.95900 | 3696958 | 17.9 | 0.463 | 9.8 |
| Developing | 59.2 | 275 | 0.01 | 68 | 3013 | 17.2 | 97 | 68 | 7.87 | 68 | 0.1 | 63.53723 | 2978599 | 18.2 | 0.454 | 9.5 |
| Developing | 58.8 | 279 | 0.01 | 66 | 1989 | 16.7 | 102 | 66 | 9.20 | 66 | 0.1 | 553.32894 | 2883167 | 18.4 | 0.448 | 9.2 |
Train test valid split
trainval_partition <-
createDataPartition(
y = regression_data$Life.expectancy,
p = .8,
list = FALSE)
trainval_data <- regression_data[ trainval_partition,]
test_data <- regression_data[-trainval_partition,]
train_partition <-
createDataPartition(
y = trainval_data$Life.expectancy,
p = .8,
list = FALSE)
train_data <- trainval_data[ train_partition,]
val_data <- trainval_data[-train_partition,]
control <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5)
linear <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
preProcess = c('scale', 'center'),
method = "lm")
linear
## Linear Regression
##
## 1878 samples
## 16 predictor
##
## Pre-processing: scaled (16), centered (16)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1690, 1690, 1690, 1690, 1690, 1690, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.232504 0.8022808 3.159272
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
lasso <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
preProcess = c('scale', 'center'),
method = "lasso")
lasso
## The lasso
##
## 1878 samples
## 16 predictor
##
## Pre-processing: scaled (16), centered (16)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1690, 1690, 1689, 1690, 1688, 1691, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 8.534207 0.6981532 6.941555
## 0.5 5.498991 0.7556058 4.146438
## 0.9 4.251140 0.8009618 3.155516
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
ridge <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
preProcess = c('scale', 'center'),
method = "ridge")
ridge
## Ridge Regression
##
## 1878 samples
## 16 predictor
##
## Pre-processing: scaled (16), centered (16)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1689, 1691, 1691, 1689, 1689, 1690, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 4.228416 0.8031364 3.159813
## 1e-04 4.228416 0.8031369 3.159841
## 1e-01 4.275729 0.8030123 3.230414
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
ggplot(varImp(ridge))